/* * Copyright 2011 OverZealous Creations, LLC * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.overzealous.remark.convert; import com.overzealous.remark.IgnoredHtmlElement; import com.overzealous.remark.Options; import com.overzealous.remark.util.BlockWriter; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; import org.jsoup.nodes.TextNode; import java.io.OutputStream; import java.io.Writer; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * The class that does the heavy lifting for converting a JSoup Document into * valid Markdown * * @author Phil DeJarnett */ public class DocumentConverter { // These properties do not change for the life of this converter final Options options; final TextCleaner cleaner; private final Set<String> ignoredHtmlTags; final Map<String,NodeHandler> blockNodes; final Map<String,NodeHandler> inlineNodes; // These properties change for each conversion private Map<String,String> linkUrls; // for looking up links via URL private int genericLinkUrlCounter; private int genericImageUrlCounter; private Map<String,String> linkIds; // an inverse of linkUrls, for looking up links via ID private Map<String,String> abbreviations; // a cache of abbreviations mapped by abbreviated form BlockWriter output; // the output writer, which may change during recursion private Map<String,NodeHandler> lastNodeset; private static final Pattern COMMA = Pattern.compile(","); private static final Pattern LINK_MULTIPLE_SPACES = Pattern.compile(" {2,}", Pattern.DOTALL); private static final Pattern LINK_SAFE_CHARS = Pattern.compile("[^-\\w \\.]+", Pattern.DOTALL); private static final String LINK_REPLACEMENT = "_"; private static final Pattern LINK_EDGE_REPLACE = Pattern.compile(String.format("(^%1$s++)|(%1$s++$)", LINK_REPLACEMENT)); private static final Pattern LINK_MULTIPLE_REPLACE = Pattern.compile(String.format("%1$s{2,}", LINK_REPLACEMENT)); private static final Pattern LINK_FILENAME = Pattern.compile("/([^/]++)$"); /** * Creates a DocumentConverted with the given options. * @param options Options for this converter. */ public DocumentConverter(Options options) { // configure final properties this.options = options; cleaner = new TextCleaner(options); ignoredHtmlTags = new HashSet<String>(); blockNodes = new HashMap<String, NodeHandler>(); inlineNodes = new HashMap<String, NodeHandler>(); // configure ignored tags for(final IgnoredHtmlElement ihe : options.getIgnoredHtmlElements()) { ignoredHtmlTags.add(ihe.getTagName()); } configureNodes(); } private void configureNodes() { addInlineNode(new InlineStyle(), "i,em,b,strong,font,span,del,strike,s"); addInlineNode(new InlineCode(), "code,tt"); addInlineNode(new Image(), "img"); addInlineNode(new Anchor(), "a"); addInlineNode(new Break(), "br"); addBlockNode (new Header(), "h1,h2,h3,h4,h5,h6"); addBlockNode (new Paragraph(), "p"); addBlockNode (new Codeblock(), "pre"); addBlockNode (new BlockQuote(), "blockquote"); addBlockNode (new HorizontalRule(), "hr"); addBlockNode (new List(), "ol,ul"); if(options.abbreviations) { addInlineNode(new Abbr(), "abbr,acronym"); } if(options.definitionLists) { addBlockNode(new Definitions(), "dl"); } // TABLES if(options.getTables().isConvertedToText()) { // if we are going to process it, add the handler addBlockNode(new Table(), "table"); } else if(options.getTables().isRemoved()) { addBlockNode(NodeRemover.getInstance(), "table"); } // else, it's being added directly } @SuppressWarnings({"UnusedDeclaration"}) public Options getOptions() { return options; } @SuppressWarnings({"UnusedDeclaration"}) public TextCleaner getCleaner() { return cleaner; } @SuppressWarnings({"UnusedDeclaration"}) public Map<String, NodeHandler> getBlockNodes() { return Collections.unmodifiableMap(blockNodes); } @SuppressWarnings({"UnusedDeclaration"}) public Map<String, NodeHandler> getInlineNodes() { return Collections.unmodifiableMap(inlineNodes); } @SuppressWarnings({"UnusedDeclaration"}) public BlockWriter getOutput() { return output; } @SuppressWarnings({"UnusedDeclaration"}) public void setOutput(BlockWriter output) { this.output = output; } /** * Customize the processing for a node. This node is added to the * inline list and the block list. The inline list is used for nodes * that do not contain linebreaks, such as {@code <em>} or {@code <strong>}. * * The tagnames is a comma-delimited list of tagnames for * which this handler should be applied. * * @param handler The handler for the nodes * @param tagnames One or more tagnames */ @SuppressWarnings({"WeakerAccess"}) public void addInlineNode(NodeHandler handler, String tagnames) { for(final String key : COMMA.split(tagnames)) { if(key.length() > 0) { inlineNodes.put(key, handler); blockNodes.put(key, handler); } } } /** * Customize the processing for a node. This node is added to the * block list only. The node handler should properly use the * {@link com.overzealous.remark.util.BlockWriter#startBlock()} and * {@link com.overzealous.remark.util.BlockWriter#endBlock()} methods as * appropriate. * * The tagnames is a comma-delimited list of tagnames for * which this handler should be applied. * * @param handler The handler for the nodes * @param tagnames One or more tagnames */ @SuppressWarnings({"WeakerAccess"}) public void addBlockNode(NodeHandler handler, String tagnames) { for(final String key : COMMA.split(tagnames)) { if(key.length() > 0) { blockNodes.put(key, handler); } } } /** * Convert a document to the given writer. * * <p><strong>Note: It is up to the calling class to handle closing the writer!</strong></p> * * @param doc Document to convert * @param out Writer to receive the final output */ public void convert(Document doc, Writer out) { this.output = new BlockWriter(out, true); this.convertImpl(doc); } /** * Convert a document to the given output stream. * * <p><strong>Note: It is up to the calling class to handle closing the stream!</strong></p> * * @param doc Document to convert * @param out OutputStream to receive the final output */ public void convert(Document doc, OutputStream out) { this.output = new BlockWriter(out, true); this.convertImpl(doc); } /** * Convert a document and return a string. * When wanting a final string, this method should always be used. * It will attempt to calculate the size of the buffer necessary to hold the entire output. * * @param doc Document to convert * @return The Markdown-formatted string. */ public String convert(Document doc) { // estimate the size necessary to handle the final output BlockWriter bw = BlockWriter.create(DocumentConverter.calculateLength(doc, 0)); this.output = bw; this.convertImpl(doc); return bw.toString(); } // Utility method to quickly walk the DOM tree and estimate the size of the // buffer necessary to hold the result. private static int calculateLength(Element el, int depth) { int result = 0; for(final Node n : el.childNodes()) { if(n instanceof Element) { result += (4 * depth) + calculateLength((Element)n, depth+1); } else if(n instanceof TextNode) { result += ((TextNode)n).text().length(); } } return result; } // implementation of the convert method. Basically handles setting up the private void convertImpl(Document doc) { // linked, because we want the resulting list of links in order they were added linkIds = new LinkedHashMap<String, String>(); // To keep track of already added URLs linkUrls = new HashMap<String, String>(); genericImageUrlCounter = 0; genericLinkUrlCounter = 0; // linked, to keep abbreviations in the order they were added abbreviations = new LinkedHashMap<String, String>(); lastNodeset = blockNodes; // walk the DOM walkNodes(DefaultNodeHandler.getInstance(), doc.body(), blockNodes); if(!linkIds.isEmpty()) { // Add links output.startBlock(); for(final Map.Entry<String,String> link : linkIds.entrySet()) { output.printf("\n[%s]: %s", link.getKey(), link.getValue()); } output.endBlock(); } if(!abbreviations.isEmpty()) { // Add abbreviations output.startBlock(); for(final Map.Entry<String,String> abbr : abbreviations.entrySet()) { output.printf("\n*[%s]: %s", abbr.getKey(), cleaner.clean(abbr.getValue())); } output.endBlock(); } // free up unused properties linkIds = null; linkUrls = null; abbreviations = null; output = null; } /** * Loops over the children of an HTML Element, handling TextNode and child Elements. * * @param currentNode The default node handler for TextNodes and IgnoredHTMLElements. * @param el The parent HTML Element whose children are being looked at. */ public void walkNodes(NodeHandler currentNode, Element el) { walkNodes(currentNode, el, lastNodeset); } /** * Loops over the children of an HTML Element, handling TextNode and child Elements. * * @param currentNodeHandler The default node handler for TextNodes and IgnoredHTMLElements. * @param el The parent HTML Element whose children are being looked at. * @param nodeList The list of valid nodes at this level. Should be one of <b>blockNodes</b> or <b>inlineNodes</b> */ public void walkNodes(NodeHandler currentNodeHandler, Element el, Map<String, NodeHandler> nodeList) { Map<String, NodeHandler> backupLastNodeset = lastNodeset; lastNodeset = nodeList; for(final Node n : el.childNodes()) { if(n instanceof TextNode) { // It's just text! currentNodeHandler.handleTextNode((TextNode) n, this); } else if(n instanceof Element) { // figure out who can handle this Element node = (Element)n; String tagName = node.tagName(); if(nodeList.containsKey(tagName)) { // OK, we know how to handle this node nodeList.get(tagName).handleNode(currentNodeHandler, node, this); } else if(ignoredHtmlTags.contains(tagName)) { // User wants to leave this tag in the output. Naughty user. currentNodeHandler.handleIgnoredHTMLElement(node, this); } else { // No-can-do, just remove the node, and keep on walkin' // The only thing we'll do is add block status in if the unknown node // usually renders as a block. // Due to BlockWriter's intelligent tracking, we shouldn't get a whole bunch // of empty lines for empty nodes. if(node.isBlock()) { output.startBlock(); } walkNodes(currentNodeHandler, node, nodeList); if(node.isBlock()) { output.endBlock(); } } } // else: not a node we care about (e.g.: comment nodes) } lastNodeset = backupLastNodeset; } /** * Recursively processes child nodes and returns the potential output string. * @param currentNode The default node handler for TextNodes and IgnoredHTMLElements. * @param el The parent HTML Element whose children are being looked at. * @return The potential output string. */ public String getInlineContent(NodeHandler currentNode, Element el) { return this.getInlineContent(currentNode, el, false); } /** * Recursively processes child nodes and returns the potential output string. * @param currentNode The default node handler for TextNodes and IgnoredHTMLElements. * @param el The parent HTML Element whose children are being looked at. * @param undoLeadingEscapes If true, leading escapes are removed * @return The potential output string. */ public String getInlineContent(NodeHandler currentNode, Element el, boolean undoLeadingEscapes) { BlockWriter oldOutput = output; output = BlockWriter.create(1000); walkNodes(currentNode, el, inlineNodes); String ret = output.toString(); output = oldOutput; if(undoLeadingEscapes) { ret = cleaner.unescapeLeadingCharacters(ret); } return ret; } /** * Adds a link to the link set, and returns the actual ID for the link. * * @param url URL for link * @param recommendedName A recommended name for non-simple link IDs. This might be modified. * @param image If true, use "img-" instead of "link-" for simple link IDs. * @return The actual link ID for this URL. */ public String addLink(String url, String recommendedName, boolean image) { String linkId; if(linkUrls.containsKey(url)) { linkId = linkUrls.get(url); } else { if(options.simpleLinkIds) { linkId = (image ? "image-" : "") + String.valueOf(linkUrls.size()+1); } else { recommendedName = cleanLinkId(url, recommendedName, image); if(linkIds.containsKey(recommendedName)) { int incr = 1; while(linkIds.containsKey(String.format("%s %d", recommendedName, incr))) { incr++; } recommendedName = String.format("%s %d", recommendedName, incr); } linkId = recommendedName; } linkUrls.put(url, linkId); linkIds.put(linkId, url); } return linkId; } /** * Adds an abbreviation to the abbreviation set. * @param abbr The abbreviation to be used * @param definition The definition for the abbreviation, should NOT be pre-escaped. */ void addAbbreviation(String abbr, String definition) { if(!abbreviations.containsKey(abbr)) { abbreviations.put(abbr, definition); } } String cleanLinkId(String url, String linkId, boolean image) { // no newlines String ret = linkId.replace('\n', ' '); // multiple spaces should be a single space ret = LINK_MULTIPLE_SPACES.matcher(ret).replaceAll(" "); // remove all characters except letters, numbers, spaces, and some basic punctuation ret = LINK_SAFE_CHARS.matcher(ret).replaceAll(LINK_REPLACEMENT); // replace multiple underscores with a single underscore ret = LINK_MULTIPLE_REPLACE.matcher(ret).replaceAll(LINK_REPLACEMENT); // replace underscores on the left or right with nothing ret = LINK_EDGE_REPLACE.matcher(ret).replaceAll(""); // trim any leading or trailing spaces ret = ret.trim(); if(ret.length() == 0 || ret.equals(LINK_REPLACEMENT)) { // if we have nothing usable left, use a generic ID if(image) { if(url != null) { Matcher m = LINK_FILENAME.matcher(url); if(m.find()) { ret = cleanLinkId(null, m.group(1), true); } else { genericImageUrlCounter++; ret = "Image " + genericImageUrlCounter; } } else { genericImageUrlCounter++; ret = "Image " + genericImageUrlCounter; } } else { genericLinkUrlCounter++; ret = "Link " + genericLinkUrlCounter; } } // else, use the cleaned id return ret; } }